"""
自定义文档解析器，生成Document对象
"""
import pandas as pd
from langchain_core.document_loaders import BaseLoader
from langchain_core.documents import Document

"""
读取excel文件，并转换为Document对象列表,page_content里保存json格式数据
"""
class CustomExcelLoader(BaseLoader):
    """Load an Excel file into a single ``Document``.

    The rows returned by ``pandas.read_excel`` (which reads the first sheet
    by default) are serialised as one JSON array of records. That JSON text
    becomes the document's ``page_content``, and the file path is stored
    under the ``"source"`` metadata key.
    """

    def __init__(self, file_path: str):
        """Store the path of the Excel file; nothing is read at this point."""
        self.file_path = file_path

    def load(self) -> list[Document]:
        """Read the Excel file and wrap its contents in one ``Document``.

        Returns:
            A one-element list containing the document with the JSON text.
        """
        frame = pd.read_excel(self.file_path, engine='openpyxl')
        # force_ascii=False keeps non-ASCII text as-is instead of \u escapes.
        records_json = frame.to_json(orient='records', force_ascii=False)
        return [
            Document(
                page_content=records_json,
                metadata={"source": self.file_path},
            )
        ]

# Demo: load a sample Excel file and print the JSON stored in the first document.
# NOTE(review): these statements sit at module top level, so they also run
# whenever this module is imported; consider an `if __name__ == "__main__":`
# guard if the class is meant to be imported from elsewhere.
# NOTE(review): the path is relative, so it presumably resolves against the
# current working directory — verify how this script is launched.
file_path = "../data/document/test.xlsx"
loader = CustomExcelLoader(file_path=file_path)
documents = loader.load()
print(documents[0].page_content)
# Sample output: [{"id":1,"name":"avv1","age":18},{"id":2,"name":"avv2","age":19},{"id":3,"name":"avv3","age":20},{"id":4,"name":"avv4","age":21},{"id":5,"name":"avv5","age":22}]